library(readxl)
library(DESeq2)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(writexl)
library(sva)

#setwd("/mnt/sdd2/data/Utrecht/Longitudinal_Data")
                                                                                                          
##read raw data
counts_file=read.csv("Longitudinal_deseq2_raw_counts.csv",row.names = 1)

##read metadata including sample name, patient/subject ID, timepoint, batch information
layout=data.frame(read_xlsx("All_paired_layout.xlsx"))
layout$Timepoint=factor(layout$Timepoint, levels=as.character(unique(layout$Timepoint)))
layout$Subject=factor(layout$Subject, levels=as.character(unique(layout$Subject)))
rownames(layout)=layout$Sample
layout$Sample=NULL

##prepare data for DESeq2 analysis
colOrder =(rownames(layout))
counts_file = counts_file[,colOrder]
data= as.data.frame(apply(counts_file, 2, as.numeric))
rownames(data)=rownames(counts_file)
data=as.matrix(data)

batch_corrected_data = ComBat_seq(data,  batch = layout$Seq_Batch,  group = layout$Timepoint)


# table(layout$Seq_Batch, layout$Subject)
# table(layout$Subject, layout$Timepoint)
# table(layout$Seq_Batch, layout$Timepoint)
#certain batch-subject-timepoint combinations 
#were not observed in the experiment, which prevents the statistical model from estimating their effects.

##Paired DESeq2 analysis
dds = DESeqDataSetFromMatrix(countData=batch_corrected_data, layout, formula(~ Subject+Timepoint))
##filter rows with less than 10 counts in 20% samples (23 out of 115 samples)
keep = rowSums(counts(dds) >= 10) >= 23
dds = dds[keep,]
dds = DESeq(dds)

# Nomralised counts 
normalised_counts=counts(dds,normalized=TRUE)
un_nomralised_counts=counts(dds,normalized=FALSE)

# Transform counts for PCA (variance stabilizing transformation)
vsd = varianceStabilizingTransformation(dds, blind = FALSE)

# Extract PCA data
pcaData = plotPCA(vsd, intgroup = c("Timepoint", "Seq_Batch"), returnData = TRUE)

# Calculate percent variance
percentVar = round(100 * attr(pcaData, "percentVar"))

# Make PCA plot using ggplot2
PCA_plot=ggplot(pcaData, aes(x = PC1, y = PC2, color = Timepoint, shape = Seq_Batch)) +
  geom_point(size = 4, alpha = 0.8) +
  theme_bw(base_size = 14) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme(
    panel.grid = element_blank(),
    legend.position = "right"
  )
print(PCA_plot)
ggsave(filename = "Supplementary figure 1.png",PCA_plot,height=6,width=6)

## T3 vs T1
res_TP_3_vs_1 = results(dds, contrast = c("Timepoint","T3","T1"), cooksCutoff=FALSE)
results_TP_3_vs_1 = data.frame(res_TP_3_vs_1)
res_TP_3_vs_1 = res_TP_3_vs_1[order(res_TP_3_vs_1$log2FoldChange, decreasing=TRUE),]
results_TP_3_vs_1 = results_TP_3_vs_1 %>% arrange(padj)

results_df_TP_3_vs_1=merge(results_TP_3_vs_1,normalised_counts,by="row.names")
results_df_TP_3_vs_1$group = "Not Significant"
results_df_TP_3_vs_1$group[results_df_TP_3_vs_1$log2FoldChange > 0.5 & results_df_TP_3_vs_1$padj < 0.05] = "Upregulated"
results_df_TP_3_vs_1$group[results_df_TP_3_vs_1$log2FoldChange < -0.5 & results_df_TP_3_vs_1$padj < 0.05] = "Downregulated"
to_write=results_df_TP_3_vs_1 %>% select(Row.names,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj)
write_xlsx(to_write,"Supplementary table 2.xlsx")
#plot volcano plot for T3 vs T1
volcano_plot = ggplot(results_df_TP_3_vs_1, aes(x = log2FoldChange, y = -log10(padj), color = group)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_manual(values = c("Upregulated" = "red",
                                "Downregulated" = "blue",
                                "Not Significant" = "grey")) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_text_repel(
    data = subset(results_df_TP_3_vs_1, abs(log2FoldChange) > 0.5 & padj < 0.05),
    aes(label = Row.names),  
    size = 5,
    box.padding = 0.5,
    point.padding = 0.2,
    max.overlaps = 1,
    segment.color = "black",
    segment.size = 0.4,
  )+
  labs(
    title = "T3 vs T1 sncRNA",
    x = "log2 Fold Change",
    y = "-log10(adjusted p-value)"
  ) +
  theme_bw(base_size = 16) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )
print(volcano_plot)
ggsave(volcano_plot,filename="tsRNAsearch_VP.png",height=6.5,width=6.5)
res_TP_3_vs_1["GlyGCC",]
#T2 vs T1
res_TP_2_vs_1 = results(dds, contrast = c("Timepoint","T2","T1"), cooksCutoff=FALSE)
results_df_TP_2_vs_1=merge(res_TP_2_vs_1,normalised_counts,by="row.names")
results_df_TP_2_vs_1$group = "Not Significant"
results_df_TP_2_vs_1$group[results_df_TP_2_vs_1$log2FoldChange > 0.5 & results_df_TP_2_vs_1$padj < 0.05] = "Upregulated"
results_df_TP_2_vs_1$group[results_df_TP_2_vs_1$log2FoldChange < -0.5 & results_df_TP_2_vs_1$padj < 0.05] = "Downregulated"

#T3 vs T2
res_TP_3_vs_2 = results(dds, contrast = c("Timepoint","T3","T2"), cooksCutoff=FALSE)
results_df_TP_3_vs_2=merge(res_TP_3_vs_2,normalised_counts,by="row.names")
results_df_TP_3_vs_2$group = "Not Significant"
results_df_TP_3_vs_2$group[results_df_TP_3_vs_2$log2FoldChange > 0.5 & results_df_TP_3_vs_2$padj < 0.05] = "Upregulated"
results_df_TP_3_vs_2$group[results_df_TP_3_vs_2$log2FoldChange < -0.5 & results_df_TP_3_vs_2$padj < 0.05] = "Downregulated"

#extract and format GlyGCC information
GlyGCC=data.frame(normalised_counts["GlyGCC",])
colnames(GlyGCC)=c("GlyGCC")
GlyGCC_data=merge(GlyGCC, layout,by=0)

#remove outliers from each timepoint
GlyGCC_df_clean = GlyGCC_data %>%
  group_by(Timepoint) %>%
  filter(
    GlyGCC >= (quantile(GlyGCC, 0.25) - 1.5 * IQR(GlyGCC)) &
      GlyGCC <= (quantile(GlyGCC, 0.75) + 1.5 * IQR(GlyGCC))
  ) %>%
  ungroup()
#number of samples at each timepoint after outlier removal
table(GlyGCC_df_clean$Timepoint)

# get DESeq2 stats for each comparison
df_stats = data.frame(
  group1 = "T3",
  group2 = "T1",
  p.adj = results_TP_3_vs_1["GlyGCC","padj"],
  y.position = max(GlyGCC_df_clean$GlyGCC) * 1.08,  # position of bracket
  label = paste0("padj = ",sprintf("%.6f", signif(results_TP_3_vs_1["GlyGCC","padj"], 2)) )
)

T2_vs_T1=data.frame(group1 = "T2",
                             group2 = "T1",
                             p.adj = res_TP_2_vs_1["GlyGCC","padj"],
                             y.position = max(GlyGCC_df_clean$GlyGCC) * 1.02,  # position of bracket
                             label = paste0("padj = ", signif(res_TP_2_vs_1["GlyGCC","padj"], 2)))

T3_vs_T2=data.frame(group1 = "T3",
                             group2 = "T2",
                             p.adj = res_TP_3_vs_2["GlyGCC","padj"],
                             y.position = max(GlyGCC_df_clean$GlyGCC) * 1.02,  # position of bracket
                             label = paste0("padj = ", signif(res_TP_3_vs_2["GlyGCC","padj"], 2)))

df_stats=rbind(df_stats, T2_vs_T1,T3_vs_T2)

#plot boxplots with LOESS curve and subject-wise GlyGCC expression
GlyGCC_filtered=ggplot(GlyGCC_df_clean, aes(x = factor(Timepoint), y = GlyGCC)) +
  geom_boxplot(aes(group = factor(Timepoint), fill = factor(Timepoint)), alpha = 0.5)  + 
  geom_line(aes(group = Subject, color = Subject), alpha = 0.4) +               
  geom_point(aes(color = Subject), alpha = 0.4) +
  scale_fill_manual(values = c("T1" = "lightgreen", "T2" = "#E69F00", "T3" = "tomato"))+
  geom_smooth(aes(group = 1), method = "loess", color = "black", se = TRUE) +  
  labs(
    title = "Patient wise trajectory of GlyGCC",
    x = "Timepoint",
    y = "Normalised counts"
  ) +
  theme_bw(base_size = 16) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  )+
  stat_pvalue_manual(
    df_stats,
    label = "label", 
    xmin = "group1",
    xmax = "group2",
    y.position = "y.position",
    tip.length = 0.02,
    size = 6
  )
GlyGCC_filtered
ggsave(GlyGCC_filtered,filename="tsRNAsearch_GlyGCC_box_plot.png",height=6.5,width=6)

#######

##Plotting GlyGCC qPCR data

qPCR_discovery=read_xlsx("qPCR and ELSIA.xlsx",sheet="GlyGCC discovery cohort")
qPCR_discovery_final= qPCR_discovery[complete.cases(qPCR_discovery), ]

qPCR_discovery_final= qPCR_discovery_final%>%
  mutate(
    Timepoint = as.factor(Timepoint)   # ensure timepoint is a factor too
  )
table(qPCR_discovery_final$Timepoint)
# 2. Boxplot + spaghetti plot
DC_GlyGCC= ggplot(qPCR_discovery_final, aes(x = Timepoint, y = log_Foldchange, fill=Timepoint)) +
  geom_boxplot() +  
  theme_bw(base_size = 16) +
  labs(
    title = "Discovery",
    x = "Timepoint",
    y = "tDR-1:34-Gly-GCC (Log foldchange to T1)"
  ) +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        plot.title = element_text(hjust = 0.5) ,
        panel.grid = element_blank(),
        panel.border = element_rect(color = "black", fill = NA),
        axis.line = element_line(color = "black")) + 
scale_fill_manual(values=c("lightgreen", "#E69F00", "tomato")) 
DC_GlyGCC
ggsave(DC_GlyGCC,filename="DC_GlyGCC_box_plot.png",height=6.5,width=6)

qPCR_validation=read_xlsx("qPCR and ELSIA.xlsx",sheet="GlyGCC validation cohort")
qPCR_validation_final= qPCR_validation[complete.cases(qPCR_validation), ]
qPCR_validation_final= qPCR_validation_final%>%
  mutate(
    Timepoint = as.factor(Timepoint)   # ensure timepoint is a factor too
  )
table(qPCR_validation_final$Timepoint)
# 2. Boxplot + spaghetti plot
VC_GlyGCC= ggplot(qPCR_validation_final, aes(x = Timepoint, y = log_Foldchange, fill=Timepoint)) +
  geom_boxplot() +  
  theme_bw(base_size = 16) +
  labs(
    title = "Replication",
    x = "Timepoint",
    y = "tDR-1:34-Gly-GCC (Log foldchange to T1)"
  ) +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        plot.title = element_text(hjust = 0.5) ,
        panel.grid = element_blank(),
        panel.border = element_rect(color = "black", fill = NA),
        axis.line = element_line(color = "black")
  ) + 
  scale_fill_manual(values=c("lightgreen", "#E69F00")) 
VC_GlyGCC
ggsave(VC_GlyGCC,filename="VC_GlyGCC_box_plot.png",height=6.5,width=5)

#######
## ANG plots
ANG_discovery_cohort=read_xlsx("qPCR and ELSIA.xlsx",sheet="ANG discovery cohort")
ANG_discovery_cohort_full= ANG_discovery_cohort[complete.cases(ANG_discovery_cohort), ]
table(ANG_discovery_cohort_full$Time)
ANG_dis = ggplot(ANG_discovery_cohort_full, aes(x=Time, y=ANG, fill=Time)) + 
  geom_violin() +
  scale_fill_manual(values=c("lightgreen", "#E69F00", "tomato")) +
  geom_boxplot(width=0.25) +
  theme_bw(base_size = 16) +
  labs(
    title = "Discovery",
    x = "Timepoint",
    y = "ANG (ng/ml)"
  ) +
  # Append 'T' to the x-axis labels
  scale_x_discrete(labels = function(x) paste0("T", x)) +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        plot.title = element_text(hjust = 0.5) ,
        panel.grid = element_blank(),
        panel.border = element_rect(color = "black", fill = NA),
        axis.line = element_line(color = "black")
  ) +
  # Remove the legend
  guides(fill="none")

ANG_dis

ggsave(ANG_dis,filename="ANG_discovery_violin.png",height=6.5,width=6)


ANG_validation_cohort=read_xlsx("qPCR and ELSIA.xlsx",sheet="ANG validation cohort")
ANG_validation_cohort_full= ANG_validation_cohort[complete.cases(ANG_validation_cohort), ]
table(ANG_validation_cohort_full$Time)
ANG_val = ggplot(ANG_validation_cohort_full, aes(x=Time, y=ANG, fill=Time)) + 
  geom_violin() +
  scale_fill_manual(values=c("lightgreen", "#E69F00")) +
  geom_boxplot(width=0.25) +
  theme_bw(base_size = 16) +
  labs(
    title = "Replication",
    x = "Timepoint",
    y = "ANG (ng/ml)"
  )+
  
  # Append 'T' to the x-axis labels
  scale_x_discrete(labels = function(x) paste0("T", x)) +
  
  # Make axis titles bold
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    plot.title = element_text(hjust = 0.5) ,
    panel.grid = element_blank(),
    panel.border = element_rect(color = "black", fill = NA),
    axis.line = element_line(color = "black")
  ) +
  
  # Remove the legend
  guides(fill="none")

ANG_val

ggsave(ANG_val,filename="ANG_replication_violin.png",height=6.5,width=6)


